正则表达式简单介绍
正则表达式(regular expression)描述了一种字符串匹配的模式,可以用来检查一个串是否含有某种子串、将匹配的子串做替换或者从某个串中取出符合某个条件的子串等。
小写字母 [a-z]
数字 [0-9] 或 \d
* 匹配前面的子表达式零次或多次
+ 匹配前面的子表达式一次或多次
? 匹配前面的子表达式零次或一次

一个简单爬虫例子
import re
import urllib.request

url = "http://mall.csdn.net/coin"
savePath = "G:/QQData/"


def getHtml(url):
    """Fetch *url* and return the response body decoded as UTF-8 text."""
    # Using the response as a context manager closes the connection even if
    # reading or decoding fails.
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8')


def getImg(html):
    """Return all ``http://img.bss.csdn.net/<digits>.jpg`` URLs found in *html*.

    The result is a list of the matched URL strings in document order
    (empty when nothing matches).
    """
    # Dots in the host name are escaped so that they match a literal "."
    # only, instead of any character.
    reg = r'http://img\.bss\.csdn\.net/[0-9]+\.jpg'
    return re.findall(reg, html)


def saveImg(url):
    """Download the image at *url* into ``savePath`` under its own file name.

    Raises:
        ValueError: if no ``<name>.jpg`` file name can be derived from *url*.
    """
    fileName = getFileName(url)
    if fileName is None:
        # Fail with a clear message before touching the network, rather than
        # with a TypeError from concatenating a path with None.
        raise ValueError("cannot derive a .jpg file name from %r" % (url,))
    with urllib.request.urlopen(url) as conn, \
            open(savePath + fileName, 'wb') as file:
        file.write(conn.read())


def getFileName(url):
    """Return the first ``[a-z0-9]+.jpg`` substring of *url*, or None if absent."""
    matchObj = re.search(r'[a-z0-9]+\.jpg', url)
    if matchObj:
        return matchObj.group()
    return None


if __name__ == "__main__":
    # Script entry: list every image URL on the page and download each one.
    html = getHtml(url)
    imgurls = getImg(html)
    for imgurl in imgurls:
        print(imgurl)
        saveImg(imgurl)
这个例子很简单,里面的正则也简单得几乎可以忽略,作用是爬取 CSDN U币商城的图片,很容易看懂。
可以自定义正则和Url的爬虫
from tkinter import *
import re
import urllib.request

savePath = "G:/QQData/"


class ControllPanel(Frame):
    """Tk panel for previewing and downloading images matched by a regex.

    NOTE(review): in the source text, everything between the start of the
    ``default_regix`` literal and the ``if`` inside ``verifyRegix`` was lost.
    ``default_regix``, ``__init__``, ``_add_inputs`` and the first half of
    ``verifyRegix`` below are a minimal reconstruction based on how the
    surviving code uses them -- confirm against the author's original.
    """

    default_url = "http://www.xingyongshe.com/man/ddfeijibei"
    # NOTE(review): placeholder pattern; only the leading "<" of the original
    # survived. It captures absolute .jpg URLs from <img src="..."> tags.
    default_regix = r'<img[^>]+src="(http[^"]+\.jpg)"'

    def __init__(self, master=None):
        """Build the input widgets and the status bar inside *master*."""
        Frame.__init__(self, master)
        # setStatus() checks this for None, so it must exist before use.
        self.status_label = None
        self.pack(fill=X)
        self._add_inputs()
        self.add_statusbar()

    def _add_inputs(self):
        """Create the URL / regex entry fields and the two action buttons."""
        # NOTE(review): widget layout and labels are reconstructed.
        Label(self, text="Url:", anchor=W).pack(fill=X)
        self.url_entry = Entry(self)
        self.url_entry.insert(0, self.default_url)
        self.url_entry.pack(fill=X)
        Label(self, text="正则:", anchor=W).pack(fill=X)
        self.regix_entry = Entry(self)
        self.regix_entry.insert(0, self.default_regix)
        self.regix_entry.pack(fill=X)
        Button(self, text="验证", command=self.verifyRegix).pack(side=LEFT)
        Button(self, text="开始", command=self.startRun).pack(side=LEFT)

    def verifyRegix(self):
        """Fetch the page, apply the regex and report how many images match.

        Returns the list of matches; each one is also printed to stdout.
        """
        # NOTE(review): the next three statements are reconstructed.
        runfunction = RunFunction(self.url_entry.get(), self.regix_entry.get())
        imgurls = runfunction.verifyRegix()
        if imgurls:
            self.setStatus("有%d张图片可以爬取", len(imgurls))
            for imgurl in imgurls:
                print(imgurl)
        return imgurls

    def startRun(self):
        """Download every image that currently matches and report the count."""
        imgurls = self.verifyRegix()
        runfunction = RunFunction(None, None)
        size = runfunction.runTask(imgurls)
        # savePath is passed as a format argument so that a "%" in the path
        # cannot corrupt the format string.
        self.setStatus("爬取了%d张图片放在了%s", size, savePath)

    def add_statusbar(self):
        """Create a sunken status label docked at the bottom of the master."""
        statusBarFrame = Frame(self.master)
        statusBarFrame.pack(side=BOTTOM, fill=X)
        self.status_label = Label(statusBarFrame, bd=1, relief=SUNKEN, anchor=W)
        self.status_label.pack(fill=X)

    def setStatus(self, format, *args):
        """Show ``format % args`` in the status bar; no-op if it is not built."""
        if self.status_label is None:
            return
        self.status_label.config(text=format % args)
        self.status_label.update_idletasks()

    def clearStatus(self):
        """Blank the status bar text."""
        self.status_label.config(text="")
        self.status_label.update_idletasks()


class RunFunction:
    """Fetches a page, extracts image URLs with a regex and saves the images."""

    def __init__(self, url, regix):
        self.url = url
        self.regix = regix

    def getHtml(self, url):
        """Fetch *url* and return the response body decoded as UTF-8 text."""
        with urllib.request.urlopen(url) as page:
            return page.read().decode('utf-8')

    def getImg(self, html, reg):
        """Return ``re.findall`` of pattern *reg* over *html*."""
        return re.findall(reg, html)

    def saveImg(self, url):
        """Download the image at *url* into ``savePath`` under its own name.

        Raises:
            ValueError: if no ``<name>.jpg`` file name can be derived from *url*.
        """
        fileName = self.getFileName(url)
        if fileName is None:
            # Fail clearly instead of with a TypeError from ``savePath + None``.
            raise ValueError("cannot derive a .jpg file name from %r" % (url,))
        # Both the response and the file are closed even if the copy fails.
        with urllib.request.urlopen(url) as conn, \
                open(savePath + fileName, 'wb') as file:
            file.write(conn.read())

    def getFileName(self, url):
        """Return the first ``[a-z0-9]+.jpg`` substring of *url*, or None."""
        matchObj = re.search(r'[a-z0-9]+\.jpg', url)
        if matchObj:
            return matchObj.group()
        return None

    def verifyRegix(self):
        """Return the matches of ``self.regix`` in the page at ``self.url``."""
        html = self.getHtml(self.url)
        return self.getImg(html, self.regix)

    def runTask(self, imgurls=None):
        """Save every URL in *imgurls* and return how many were saved.

        When *imgurls* is omitted the list is obtained from ``verifyRegix()``.
        This replaces two same-named methods, of which the zero-argument one
        was shadowed and called ``verifyRegix`` with the wrong arguments.
        """
        if imgurls is None:
            imgurls = self.verifyRegix()
        for imgurl in imgurls:
            self.saveImg(imgurl)
        return len(imgurls)


if __name__ == "__main__":
    root = Tk()
    root.title("爬虫管理窗口")
    # Center the window, using the window's maximum size as the screen size.
    scnWidth, scnHeight = root.maxsize()
    tmpcnf = '%dx%d+%d+%d' % (308, 101, (scnWidth - 308) // 2,
                              (scnHeight - 101) // 2)
    root.geometry(tmpcnf)
    root.maxsize(600, 300)
    root.minsize(360, 220)
    # root.resizable(False, False)  # would keep the window size fixed
    controllPanel = ControllPanel(root)
    controllPanel.setStatus("等待爬取……")
    root.mainloop()
    try:
        root.destroy()
    except TclError:
        # The window was already destroyed when the user closed it.
        pass